Preparations

Load the necessary libraries

library(mgcv)      #for GAMs
library(gratia)    #for GAM plots
library(emmeans)   #for marginal means etc
library(broom)     #for tidy output
library(MuMIn)     #for model selection and AICc
library(lubridate) #for processing dates
library(mapdata)
library(maps)
library(tidyverse) #for data wrangling
library(DHARMa)    #for residual diagnostics
library(performance)
library(see)
library(sf)
library(stars)
library(rnaturalearth)
library(rnaturalearthdata)
library(raster)
library(ggspatial)
library(patchwork)
theme_set(theme_classic())

Scenario

Paruelo and Lauenroth (1996) analyzed the geographic distribution and the effects of climate variables on the relative abundance of a number of plant functional types (PFT’s) including shrubs, forbs, succulents (e.g. cacti), C3 grasses and C4 grasses. They used data from 73 sites across temperate central North America (see pareulo.csv) and calculated the relative abundance of C3 grasses at each site as a response variable

grass

Format of paruelo.csv data file

c3 lat long map mat jjamap djfmap
... ... ... ... ... ... ...
c3 - Relative abundance of c3 grasses at each site - response variable
lat - Latitudinal coordinate
long - Longitudinal coordinate
map - Mean annual precipitation
mat - Mean annual temperature
jjamap - Mean annual precipitation in June, July, August
djfmap - Mean annual precipitation in December, January, February

Read in the data

paruelo <- read_csv('../data/paruelo.csv', trim_ws=TRUE) %>%
  janitor::clean_names()
glimpse(paruelo)
## Rows: 73
## Columns: 7
## $ c3     <dbl> 0.65, 0.65, 0.76, 0.75, 0.33, 0.03, 0.00, 0.02, 0.05, 0.05, 0.…
## $ lat    <dbl> 46.40, 47.32, 45.78, 43.95, 46.90, 38.87, 32.62, 36.95, 35.30,…
## $ long   <dbl> 119.55, 114.27, 110.78, 101.87, 102.82, 99.38, 106.75, 96.55, …
## $ map    <dbl> 199, 469, 536, 476, 484, 623, 259, 969, 542, 421, 446, 376, 66…
## $ mat    <dbl> 12.4, 7.5, 7.2, 8.2, 4.8, 12.0, 14.5, 15.3, 13.9, 8.5, 5.1, 11…
## $ jjamap <dbl> 0.12, 0.24, 0.24, 0.35, 0.40, 0.40, 0.47, 0.30, 0.44, 0.31, 0.…
## $ djfmap <dbl> 0.45, 0.29, 0.20, 0.15, 0.14, 0.11, 0.17, 0.14, 0.13, 0.14, 0.…
canada <- ne_countries(country = c("united states of america", "canada"), scale = "medium", returnclass = "sf") %>%
  st_crop(xmin = -130, xmax = -60, 
          ymin = 0, ymax = 80)
glimpse(canada)
## Rows: 2
## Columns: 64
## $ scalerank  <int> 1, 1
## $ featurecla <chr> "Admin-0 country", "Admin-0 country"
## $ labelrank  <dbl> 2, 2
## $ sovereignt <chr> "Canada", "United States of America"
## $ sov_a3     <chr> "CAN", "US1"
## $ adm0_dif   <dbl> 0, 1
## $ level      <dbl> 2, 2
## $ type       <chr> "Sovereign country", "Country"
## $ admin      <chr> "Canada", "United States of America"
## $ adm0_a3    <chr> "CAN", "USA"
## $ geou_dif   <dbl> 0, 0
## $ geounit    <chr> "Canada", "United States of America"
## $ gu_a3      <chr> "CAN", "USA"
## $ su_dif     <dbl> 0, 0
## $ subunit    <chr> "Canada", "United States of America"
## $ su_a3      <chr> "CAN", "USA"
## $ brk_diff   <dbl> 0, 0
## $ name       <chr> "Canada", "United States"
## $ name_long  <chr> "Canada", "United States"
## $ brk_a3     <chr> "CAN", "USA"
## $ brk_name   <chr> "Canada", "United States"
## $ brk_group  <chr> NA, NA
## $ abbrev     <chr> "Can.", "U.S.A."
## $ postal     <chr> "CA", "US"
## $ formal_en  <chr> "Canada", "United States of America"
## $ formal_fr  <chr> NA, NA
## $ note_adm0  <chr> NA, NA
## $ note_brk   <chr> NA, NA
## $ name_sort  <chr> "Canada", "United States of America"
## $ name_alt   <chr> NA, NA
## $ mapcolor7  <dbl> 6, 4
## $ mapcolor8  <dbl> 6, 5
## $ mapcolor9  <dbl> 2, 1
## $ mapcolor13 <dbl> 2, 1
## $ pop_est    <dbl> 33487208, 313973000
## $ gdp_md_est <dbl> 1300000, 15094000
## $ pop_year   <dbl> NA, 0
## $ lastcensus <dbl> 2011, 2010
## $ gdp_year   <dbl> NA, 0
## $ economy    <chr> "1. Developed region: G7", "1. Developed region: G7"
## $ income_grp <chr> "1. High income: OECD", "1. High income: OECD"
## $ wikipedia  <dbl> NA, 0
## $ fips_10    <chr> NA, NA
## $ iso_a2     <chr> "CA", "US"
## $ iso_a3     <chr> "CAN", "USA"
## $ iso_n3     <chr> "124", "840"
## $ un_a3      <chr> "124", "840"
## $ wb_a2      <chr> "CA", "US"
## $ wb_a3      <chr> "CAN", "USA"
## $ woe_id     <dbl> NA, NA
## $ adm0_a3_is <chr> "CAN", "USA"
## $ adm0_a3_us <chr> "CAN", "USA"
## $ adm0_a3_un <dbl> NA, NA
## $ adm0_a3_wb <dbl> NA, NA
## $ continent  <chr> "North America", "North America"
## $ region_un  <chr> "Americas", "Americas"
## $ subregion  <chr> "Northern America", "Northern America"
## $ region_wb  <chr> "North America", "North America"
## $ name_len   <dbl> 6, 13
## $ long_len   <dbl> 6, 13
## $ abbrev_len <dbl> 4, 6
## $ tiny       <dbl> NA, NA
## $ homepart   <dbl> 1, 1
## $ geometry   <MULTIPOLYGON [°]> MULTIPOLYGON (((-60 43.9057..., MULTIPOLYGON …
spain <- ne_countries(country = c("spain", "italy"), scale = "medium", returnclass = "sf")

Exploratory data analysis

ggplot() +
  geom_sf(data = canada)

The map is with a spatial features mapping, so it knows what the shape and coordinate reference system is from the sf object.

paruelo %>% head()

Note that the C3 data is % cover. To do percent cover, we need to ensure we do not have 0 or 100% cover ever. We also need to ensure that, when projecting onto a map, we have the same coordinate system.

paruelo <- paruelo %>%
  mutate(c3nz = ifelse(c3 == 0, 0.01, c3),
         long = -long)
paruelo_sf <- paruelo %>%
  st_as_sf(coords = c("long", "lat"), crs = st_crs(canada))
ggplot() +
  geom_sf(data = canada, aes(fill = adm0_a3_is), color = "white") +
  scale_fill_manual(values = c("red", "light blue"), guide = "none") +
  geom_sf(data = paruelo_sf, aes(col = c3, size = c3), alpha = 0.9) +
  scale_color_viridis_c() +
  coord_sf(expand = FALSE)

ggplot(paruelo) + 
  geom_histogram(aes(x = c3))

We will focus on the spatial components.

Model formula: \[ y_i \sim{} \mathcal{N}(\mu_i, \sigma^2)\\ \mu_i =\beta_0 + f(Long_i) + f(Lat_i) + f(Long_i, Lat_i) \]

where \(\beta_0\) is the y-intercept. \(f(Lat)\) and \(f(Long)\) indicate the additive smoothing functions of the spatial predictors.

Fit the model

s smooth using lat and long (must be in same scale)

paruelo_gam1 <- gam(c3nz ~ s(long, lat), data = paruelo,
                    family = betar, method = "REML")

Note that we can combine the two smoother terms together because the two are on the same scale. This is only possible because the projection is to a flat, Mercator projection surface.

k.check(paruelo_gam1)
##             k'      edf  k-index p-value
## s(long,lat) 29 4.848145 0.898749  0.1025

Looks like it is ok for not being overconstrained.

simulateResiduals(paruelo_gam1, plot = T)

We are unable to get the residuals because of the model family, but DHARMa has its own function for creating DHARMa-related functions from the model we select!

sim_paruelo_resids <- createDHARMa(
  simulatedResponse = simulate(paruelo_gam1, nsim = 250),
  observedResponse = paruelo$c3nz,
  fittedPredictedResponse = predict(paruelo_gam1))

plot(sim_paruelo_resids)

Looks great!

te tensor products for two with different scales

Another way of modelling is using tensor product smooths: bs = "te". These are especially useful when the two terms are on different scales.

paruelo_gam2 <- gam(c3nz ~ te(long, lat), data = paruelo,
                    family = betar, method = "REML")
createDHARMa(
  simulatedResponse = simulate(paruelo_gam2, nsim = 250),
  observedResponse = paruelo$c3nz,
  fittedPredictedResponse = predict(paruelo_gam2)) %>% plot

k.check(paruelo_gam2)
##              k'     edf   k-index p-value
## te(long,lat) 24 3.00004 0.7814077   0.015

All looks good! Note that edf is way down now!

However, the output of both of these models is a 2D smoother. If we want to split them both up, we use a tensor product interaction: bs = "ti", to see the differences for each component.

ti for tensor interactions

paruelo_gam3 <- gam(c3nz ~ ti(long) + ti(lat) + ti(long, lat), data = paruelo,
                    family = betar, method = "REML")
createDHARMa(
  simulatedResponse = simulate(paruelo_gam3, nsim = 250),
  observedResponse = paruelo$c3nz,
  fittedPredictedResponse = predict(paruelo_gam3)) %>% plot

k.check(paruelo_gam3)
##              k'      edf   k-index p-value
## ti(long)      4 1.000019 0.8673167  0.1250
## ti(lat)       4 1.000024 0.9588562  0.3400
## ti(long,lat) 16 3.523281 0.9956490  0.3775

All looks good!

Partial plots

Using gratia::draw()

draw(paruelo_gam1) # thin plate spline

draw(paruelo_gam2) # tensor product

draw(paruelo_gam3) # tensor product interaction

In the third plot, it shows the effect of long when using the average lat, the effect of lat when using the average long, and the effect of the two combined together!

Using mgcv::vis.gam()

par(mfrow = c(2,3))
vis.gam(paruelo_gam1, theta=30)
vis.gam(paruelo_gam2, theta=30)
vis.gam(paruelo_gam3, theta=30)
vis.gam(paruelo_gam1, theta=-30)
vis.gam(paruelo_gam2, theta=-30)
vis.gam(paruelo_gam3, theta=-30)

par(mfrow = c(1,1))

Model investigation / hypothesis testing

summary(paruelo_gam1)
## 
## Family: Beta regression(4.304) 
## Link function: logit 
## 
## Formula:
## c3nz ~ s(long, lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.1021     0.1081  -10.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df Chi.sq p-value    
## s(long,lat) 4.848  6.815  59.88  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.455   Deviance explained = 56.3%
## -REML = -45.112  Scale est. = 1         n = 73

Says that there is a certain degree of wiggliness

Conclusions:

  • in the very center of the sampling domain (average longitude and latitude), the expected percentage cover of C3 grasses is the intercept = -1.1 (link scale). If we back-transform to the response scale, this is 24.93%
  • there is evidence that the abundance of c3 grasses varies non-linearly over the spatial extent of the sampling domain.
  • the model explains 56.28% of the total deviance.
tidy(paruelo_gam1)

te(long,lat)

summary(paruelo_gam2)
## 
## Family: Beta regression(4.366) 
## Link function: logit 
## 
## Formula:
## c3nz ~ te(long, lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.1120     0.1077  -10.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df Chi.sq p-value    
## te(long,lat)   3      3  62.04  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.466   Deviance explained = 55.4%
## -REML = -52.794  Scale est. = 1         n = 73

Conclusions:

  • in the very center of the sampling domain (average longitude and latitude), the expected percentage cover of c3 grasses is -1.11 (link scale). If we back-transform to the response scale, this is 24.75%
  • there is evidence that the abundance of c3 grases varies non-linearly over the spatial extent of the sampling domain.
  • the model explains 55.42% of the total deviance.
tidy(paruelo_gam2)

ti(long,lat)

summary(paruelo_gam3)
## 
## Family: Beta regression(5.099) 
## Link function: logit 
## 
## Formula:
## c3nz ~ ti(long) + ti(lat) + ti(long, lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.2330     0.1151  -10.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df Chi.sq p-value    
## ti(long)     1.000  1.000  0.378 0.53843    
## ti(lat)      1.000  1.000 73.604 < 2e-16 ***
## ti(long,lat) 3.523  3.875 15.070 0.00298 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.535   Deviance explained = 63.6%
## -REML =  -52.2  Scale est. = 1         n = 73

Clear evidence of an interaction between long and lat affecting the values.

Conclusions:

  • in the very center of the sampling domain (average longitude and latitude), the expected percentage cover of c3 grasses is -1.23 (link scale). If we back-transform to the response scale, this is 22.57%
  • at the average latitude, there is no evidence of a longitudinal shift in c3 percentage cover.
  • at the average longitude, there is evidence that c3 percentage cover varies non-linearly from north to south.
  • there is evidence that the abundance of c3 grases varies non-linearly over the spatial extent of the sampling domain.
  • the model explains 63.62% of the total deviance.
tidy(paruelo_gam3)

Summary figures

paruelo_list <- with(paruelo,
                    list(lat = modelr::seq_range(lat, n=100),
                         long = modelr::seq_range(long, n=100)))
newdata <- 
  emmeans(paruelo_gam3, ~long+lat, at = paruelo_list, type='response') %>%
  as.data.frame %>% 
  rename(c3 = response, lwr = lower.CL, upr = upper.CL)

newdata %>% head
newdata %>%
  ggplot(aes(y = lat, x = long)) +
  geom_tile(aes(fill = c3)) +
  geom_contour(aes(z = c3)) +
  scale_fill_gradientn(colors = heat.colors(10)) +
  geom_point(data = paruelo, aes(fill = c3), shape = 21, size = 5) +
  coord_equal()

newdata_sf <- 
  newdata %>%
  dplyr::select(long, lat, c3) %>%
  rasterFromXYZ() %>% 
  mask(canada) %>%
  st_as_stars() %>%
  st_set_crs(st_crs(canada))
## OR
#newdata.sf <- newdata %>% 
#  st_as_sf(coords=c("long", "lat"),  crs=st_crs(canada)) %>%
#  st_rasterize()
ggplot() +
  geom_sf(data=canada) +
  geom_stars(data=newdata_sf) +
  scale_color_viridis_c() +
  coord_sf(expand = FALSE) +
  geom_sf(data = paruelo_sf, aes(col = c3, size = c3), alpha = 0.9)

ggplot() +
  geom_sf(data = canada) +
  geom_stars(data = newdata_sf) +
  scale_fill_viridis_c() + 
  geom_sf(data=paruelo_sf, aes(fill = c3), shape = 21,  size = 4) +
  annotation_scale(location = "bl", width_hint = 0.25) +
  annotation_north_arrow(location = "bl", which_north = "true", 
        pad_x = unit(0.25, "in"), pad_y = unit(0.2, "in"),
        style = north_arrow_fancy_orienteering) +
  coord_sf(expand=FALSE, ylim = c(20, 60)) +
  theme(axis.title=element_blank(),
        legend.position=c(0.99, 0),  legend.justification=c(1, 0))

Paruelo, J. M., and W. K. Lauenroth. 1996. “Relative Abundance of Plant Functional Types in Grasslands and Shrublands of North America.” Ecologicalapplications, no. 6: 1212–24.